Leveraging Social Media Sentiment Analysis for Real-Time Agricultural Market Trend Forecasting
In [13]:
import os

import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import nltk, re, warnings
# Silence library warnings so the cell output stays readable.
# NOTE(review): this blanket filter hides every warning category — consider narrowing it.
warnings.filterwarnings('ignore')
# NOTE(review): the VADER lexicon is downloaded here, but no cell visible in this
# notebook uses it — confirm it is still needed.
nltk.download('vader_lexicon')

# Load dataset
# The CSV location can be overridden with the AGRI_DATA_PATH environment variable so the
# notebook runs on other machines; the original local path is kept as the default.
DATA_PATH = os.environ.get(
    "AGRI_DATA_PATH",
    r"C:\Users\malot\OneDrive\Desktop\Big data\Agri_BigData_3000.csv",
)
df = pd.read_csv(DATA_PATH)
# Parsed with dayfirst=True; values that cannot be parsed become NaT instead of raising.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')
print("✅ Dataset Loaded Successfully with", df.shape[0], "rows")
df.head()
✅ Dataset Loaded Successfully with 3000 rows
[nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\malot\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
Out[13]:
| | Region | Crop | Emotion | Post_Text | Date | Sentiment_Score | Price | Demand_Index | Temperature | Rainfall |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Gujarat | Wheat | Positive | wilt | 2025-03-04 | 0 | 4562.48 | 136 | 34.81 | 10.91 |
| 1 | Maharashtra | Pulses | Positive | pest attack | 2025-01-24 | 1 | 4731.54 | 63 | 31.22 | 14.32 |
| 2 | Gujarat | Pulses | Happy | borer | 2025-05-31 | 1 | 5221.20 | 157 | 36.12 | 13.96 |
| 3 | Telangana | Sugarcane | Neutral | blight | 2025-03-28 | 0 | 3715.47 | 95 | 36.95 | 12.07 |
| 4 | Punjab | Rice | Angry | healthy crop | 2025-02-01 | -1 | 9636.10 | 110 | 25.86 | 12.57 |
USE CASE 1 — Early Warning System for Pest & Disease Outbreaks
In [15]:
# Step 1: Detect pest/disease-related posts from Post_Text
pest_keywords = ['pest', 'attack', 'borer', 'blight', 'wilt', 'fungus', 'disease', 'infestation']


def detect_pest_risk(text, keywords=None):
    """Return 1 if ``text`` mentions any pest/disease keyword, otherwise 0.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word (e.g. 'pest' inside 'pesticide').

    Parameters
    ----------
    text : any
        Post text. It is converted with ``str()`` first, so non-string values
        (None, NaN, numbers) are accepted and matched on their string form.
    keywords : iterable of str, optional
        Keywords to search for. Defaults to the module-level ``pest_keywords``.

    Returns
    -------
    int
        1 when at least one keyword occurs in the text, else 0.
    """
    if keywords is None:
        keywords = pest_keywords
    lowered = str(text).lower()
    # Lower-case the keywords too so caller-supplied lists need not be normalised.
    return 1 if any(str(word).lower() in lowered for word in keywords) else 0
# Flag every post whose text matches a pest/disease keyword (1) or not (0).
df['Pest_Alert'] = df['Post_Text'].map(detect_pest_risk)
# Step 2: Create Risk Score using Sentiment & Pest Mentions
# (Sentiment_Score + 1) / 2 rescales the sentiment; one minus that value is larger
# the more negative the post. Posts without a pest flag are multiplied by 0.
rescaled_sentiment = (df['Sentiment_Score'] + 1) / 2
negativity = 1 - rescaled_sentiment
df['Risk_Score'] = df['Pest_Alert'] * negativity * 100
# Step 3: Categorize Alerts
def risk_label(score, high=60, moderate=30):
    """Map a numeric risk score to an alert level.

    Parameters
    ----------
    score : number
        Risk score to classify.
    high : number, optional
        Scores greater than or equal to this are 'High' (default 60).
    moderate : number, optional
        Scores greater than or equal to this, but below ``high``, are
        'Moderate' (default 30).

    Returns
    -------
    str
        'High', 'Moderate' or 'Low'. A score that fails both comparisons
        (including NaN) falls through to 'Low'.
    """
    if score >= high:
        return 'High'
    if score >= moderate:
        return 'Moderate'
    return 'Low'
# Label every post with its alert level.
df['Alert_Level'] = df['Risk_Score'].map(risk_label)
# Step 4: Visualization
# One bar per region showing the mean risk score; the bar colour encodes the same value.
region_risk = df.groupby('Region')['Risk_Score'].mean().reset_index()
fig = px.bar(
    region_risk,
    x='Region',
    y='Risk_Score',
    color='Risk_Score',
    color_continuous_scale='Reds',
    title='🌾 Average Pest/Disease Risk by Region',
)
fig.show()
# Step 5: WordCloud for high-risk posts
# Drop missing posts and coerce the rest to str so the join cannot raise a TypeError
# on NaN or other non-string values.
high_risk_text = df.loc[df['Alert_Level'] == "High", 'Post_Text'].dropna().astype(str)
pest_text = " ".join(high_risk_text)
if pest_text.strip():
    wc = WordCloud(width=900, height=400, background_color='white').generate(pest_text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title("Pest/Disease Mentions — High Risk Posts")
    plt.show()
else:
    # WordCloud.generate raises ValueError when it is given no words to draw.
    print("No high-risk posts found — skipping word cloud.")
USE CASE 2 — Real-Time Commodity Price Nowcasting
In [19]:
# Fit a linear model of Price on sentiment, demand and weather, then compare the
# predictions with the actual prices over time.
features = ['Sentiment_Score', 'Demand_Index', 'Temperature', 'Rainfall']
X = df[features]
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
lr = LinearRegression().fit(X_train, y_train)
# Predictions for every row (training rows included) are stored for plotting only.
df['Predicted_Price'] = lr.predict(X)
# Score on the held-out rows; predict once and reuse the result for both metrics.
y_test_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2 = r2_score(y_test, y_test_pred)
print(f"📊 Price Nowcast - RMSE: {round(rmse,2)} | R²: {round(r2,3)}")
fig = px.scatter(df, x='Date', y='Price', color='Crop',
                 title='Actual vs Predicted Commodity Prices',
                 hover_data=['Region'])
# px.line joins points in row order, so the frame is sorted by Date first; otherwise
# the predicted-price lines zigzag back and forth across the time axis.
# category_orders pins the crop order to the unsorted frame's order of appearance so
# each crop's line keeps the same colour as its scatter points.
crop_order = pd.unique(df['Crop'].dropna()).tolist()
predicted_by_date = df.sort_values('Date')
fig.add_traces(
    px.line(predicted_by_date, x='Date', y='Predicted_Price', color='Crop',
            category_orders={'Crop': crop_order}).data
)
fig.show()
📊 Price Nowcast - RMSE: 2073.63 | R²: 0.012
USE CASE 3 — Farmer Sentiment Monitoring
In [22]:
# Step 1: Sentiment Labeling
def sentiment_label(x):
    """Convert a numeric sentiment score into a categorical label.

    Parameters
    ----------
    x : number
        Sentiment score.

    Returns
    -------
    str
        'Positive' when ``x > 0``, 'Negative' when ``x < 0``, and 'Neutral'
        for anything else (zero, and also NaN, since both comparisons fail).
    """
    if x > 0:
        return 'Positive'
    if x < 0:
        return 'Negative'
    return 'Neutral'
# Attach the categorical label derived from each post's sentiment score.
df['Sentiment_Label'] = df['Sentiment_Score'].map(sentiment_label)
# Step 2: Visualization - Sentiment Distribution
# Grouped bars: one cluster per sentiment label, one bar per region.
fig = px.histogram(
    df,
    x='Sentiment_Label',
    color='Region',
    barmode='group',
    title="Farmer Sentiment Distribution by Region",
)
fig.show()
# Step 3: Sentiment Trend Over Time
# Average the sentiment score per calendar month of the post date.
post_month = df['Date'].dt.to_period('M')
sent_trend = df.groupby(post_month)['Sentiment_Score'].mean().reset_index()
# Convert the monthly periods to plain strings for the x axis.
sent_trend['Date'] = sent_trend['Date'].astype(str)
fig = px.line(
    sent_trend,
    x='Date',
    y='Sentiment_Score',
    title='Monthly Sentiment Trend',
)
# Fix the y axis to the range -1..1.
fig.update_yaxes(range=[-1, 1])
fig.show()
# Step 4: WordCloud for Emotions
# Every Emotion value is cast to str and joined into one space-separated text.
emotion_words = df['Emotion'].astype(str)
emotion_text = " ".join(emotion_words)
cloud_builder = WordCloud(width=900, height=400, background_color='white')
wc = cloud_builder.generate(emotion_text)
plt.imshow(wc, interpolation='bilinear')
plt.title("Farmer Emotions WordCloud")
plt.axis('off')
plt.show()
USE CASE 4 — Market Demand Prediction
In [25]:
# Predict Demand_Index from price, sentiment and weather with a random forest.
features_d = ['Price', 'Sentiment_Score', 'Rainfall', 'Temperature']
X, y = df[features_d], df['Demand_Index']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
# Predictions for every row (training rows included) are stored on the frame.
df['Predicted_Demand'] = rf.predict(X)
# Held-out metrics; the test-set predictions are computed once and reused.
demand_test_pred = rf.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, demand_test_pred))
r2 = r2_score(y_test, demand_test_pred)
print(f"🔮 Demand Prediction - RMSE: {round(rmse,2)} | R²: {round(r2,3)}")
# Feature Importance
# Sorted ascending before being drawn as horizontal bars.
imp = pd.Series(rf.feature_importances_, index=features_d).sort_values()
fig = px.bar(
    imp,
    x=imp.values,
    y=imp.index,
    orientation='h',
    title='Feature Importance for Market Demand Prediction',
)
fig.show()
🔮 Demand Prediction - RMSE: 31.45 | R²: -0.068
USE CASE 5 — Regional Alert & Recommendation System
In [28]:
# Per-region aggregates used by the recommendation rules.
# NOTE(review): 'high_risk_posts' is the sum of Pest_Alert, i.e. the number of posts
# with any pest/disease keyword match — it is not restricted to Alert_Level == 'High'.
region_summary = df.groupby('Region', as_index=False).agg(
    avg_sentiment=('Sentiment_Score', 'mean'),
    avg_rainfall=('Rainfall', 'mean'),
    avg_temp=('Temperature', 'mean'),
    avg_price=('Price', 'mean'),
    avg_demand=('Demand_Index', 'mean'),
    high_risk_posts=('Pest_Alert', 'sum'),
)
def recommendation(row, pest_post_threshold=5, demand_threshold=120, rainfall_threshold=10):
    """Pick one advisory message for a region from its summary statistics.

    Rules are checked in priority order and the first match wins:
    pest risk, then negative sentiment, then high demand, then low rainfall.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'high_risk_posts', 'avg_sentiment',
        'avg_demand' and 'avg_rainfall' (e.g. a DataFrame row or a dict).
    pest_post_threshold : number, optional
        A region is flagged for pest risk when 'high_risk_posts' exceeds
        this value (default 5).
        NOTE(review): this is an absolute post count; in the recorded output
        of this notebook every region exceeded the default, so consider a
        larger value or a per-post rate.
    demand_threshold : number, optional
        'avg_demand' above this triggers the high-demand message (default 120).
    rainfall_threshold : number, optional
        'avg_rainfall' below this triggers the irrigation advisory (default 10).

    Returns
    -------
    str
        The advisory message for the region.
    """
    if row['high_risk_posts'] > pest_post_threshold:
        return "🚨 High pest risk — monitor immediately"
    if row['avg_sentiment'] < 0:
        return "😟 Negative sentiment — provide farmer support"
    if row['avg_demand'] > demand_threshold:
        return "📈 High demand — plan supply chain"
    if row['avg_rainfall'] < rainfall_threshold:
        return "💧 Low rainfall — irrigation advisory"
    return "✅ Stable region"
# Evaluate the rule set once per region row and store the chosen message.
region_messages = region_summary.apply(recommendation, axis=1)
region_summary['Recommendation'] = region_messages
# Bars show the average sentiment per region, coloured by the recommendation given.
fig = px.bar(
    region_summary,
    x='Region',
    y='avg_sentiment',
    color='Recommendation',
    title='Regional Sentiment & Recommendations',
)
fig.show()
# Display the region-to-recommendation table as the cell output.
region_summary.loc[:, ['Region', 'Recommendation']]
Out[28]:
| | Region | Recommendation |
|---|---|---|
| 0 | Andhra Pradesh | 🚨 High pest risk — monitor immediately |
| 1 | Gujarat | 🚨 High pest risk — monitor immediately |
| 2 | Karnataka | 🚨 High pest risk — monitor immediately |
| 3 | Maharashtra | 🚨 High pest risk — monitor immediately |
| 4 | Punjab | 🚨 High pest risk — monitor immediately |
| 5 | Tamil Nadu | 🚨 High pest risk — monitor immediately |
| 6 | Telangana | 🚨 High pest risk — monitor immediately |
In [ ]: